Python Standard Library Analysis¶
InĀ [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from analysis_utils import add_complexity_to_metadata, prepare_std_lib_treemap_data, create_std_lib_treemap, plot_usage_in_files, plot_usage_within_files, plot_mean_complexity
InĀ [2]:
# Per-file metadata: read the parquet file, discard incomplete rows, then let the
# helper attach complexity information (see the `complexity` column in the output).
metadata = add_complexity_to_metadata(
    pd.read_parquet('/workspaces/repos/randomstats/github/metadata.parquet').dropna()
)
metadata
Out[2]:
| calls | assignments | attributes | size | is_ipynb | complexity | |
|---|---|---|---|---|---|---|
| chunk_id | ||||||
| 4444d3cdff850ebe6ce372c7b867141a2d041f11 | 31.0 | 18.0 | 24.0 | 2176.0 | False | 0.172107 |
| 92eea3f82b5e926c9a8d6568a2b3eb351199bf9e | 3.0 | 2.0 | 4.0 | 590.0 | False | 0.024374 |
| d8108f44778865705e71acaf17442a499afbc5d0 | 0.0 | 1.0 | 0.0 | 2278.0 | False | 0.031695 |
| e66ae67333542402587a359a3a84554c27444aad | 6.0 | 6.0 | 4.0 | 886.0 | False | 0.046665 |
| 1d6cb5c0c1a8f8d441bb85db936dd202ae3dc58f | 6.0 | 4.0 | 13.0 | 876.0 | False | 0.051324 |
| ... | ... | ... | ... | ... | ... | ... |
| 79e8c8fcb9d1293b01a25327c5193a3ec2149dd7 | 8.0 | 5.0 | 5.0 | 540.0 | False | 0.043811 |
| 55d9448fc5f4015cf76f7246de39ce8424870ee0 | 0.0 | 0.0 | 0.0 | 3984.0 | True | 0.049601 |
| 8bd51a5a60142e8e839697d0ead4cdaa2a16d64c | 39.0 | 14.0 | 79.0 | 3727.0 | False | 0.262118 |
| 791f8251863b3a09b1cf8f4d16e56933b0830e39 | 35.0 | 26.0 | 24.0 | 6548.0 | False | 0.260235 |
| 5c2402c51df4f30da18b00dcbfd8d1899348d48a | 23.0 | 6.0 | 18.0 | 2633.0 | False | 0.116104 |
7484750 rows × 6 columns
InĀ [3]:
# Standard-library usage counts; the displayed columns are chunk_id, library_name,
# component_type, component and count.
libraries = pd.read_parquet('/workspaces/repos/randomstats/github/library_counts.parquet')
libraries
Out[3]:
| chunk_id | library_name | component_type | component | count | |
|---|---|---|---|---|---|
| 0 | 284c4fa678838e17f66eed0db60ea67353fad38d | math | from_import_function | ceil | 2 |
| 1 | 284c4fa678838e17f66eed0db60ea67353fad38d | warnings | function | warn | 1 |
| 2 | 89c3b09cdaa74e55dc6241088e690a7cbf0dfe10 | ctypes | class | c_byte | 2 |
| 3 | 89c3b09cdaa74e55dc6241088e690a7cbf0dfe10 | ctypes | class | CDLL | 2 |
| 4 | 89c3b09cdaa74e55dc6241088e690a7cbf0dfe10 | ctypes | class | c_void_p | 47 |
| ... | ... | ... | ... | ... | ... |
| 125612 | 738e07780b1324c00aba152966305aaa0cc53ca4 | os | attribute | path | 1 |
| 125613 | 738e07780b1324c00aba152966305aaa0cc53ca4 | pdb | from_import_function | pm | 1 |
| 125614 | 641836419114f68ecc105425ade25cd7813979a4 | os | attribute | path | 7 |
| 125615 | 641836419114f68ecc105425ade25cd7813979a4 | site | function | addsitedir | 1 |
| 125616 | 0cd9ae3f7372a72618bf2966d1b8c75c618feb1d | math | function | atan2 | 2 |
18056061 rows × 5 columns
InĀ [4]:
# chunk_id is the index of `metadata`; collect the ids of plain .py files and of
# notebooks separately, then split the usage rows along the same line.
py_ids = metadata.index[(metadata['is_ipynb'] == False).to_numpy()]
ipynb_ids = metadata.index[(metadata['is_ipynb'] == True).to_numpy()]
libraries_py = libraries.loc[libraries['chunk_id'].isin(py_ids)]
libraries_ipynb = libraries.loc[libraries['chunk_id'].isin(ipynb_ids)]
Most Commonly Used Libraries and Their Components¶
By Number of Files They Were Used in¶
Top Libraries¶
InĀ [5]:
# Library-level view over all chunks (.py and .ipynb); top_n limits the chart to 30 entries.
plot_usage_in_files(libraries, top_n=30)
Top os Components¶
InĀ [6]:
# Component-level view for the `os` library over all chunks, limited to 30 entries.
plot_usage_in_files(libraries, library_name='os', top_n=30)
Top logging Components¶
InĀ [7]:
# Component-level view for the `logging` library over all chunks, limited to 30 entries.
plot_usage_in_files(libraries, library_name='logging', top_n=30)
Top re Components¶
InĀ [8]:
# Component-level view for the `re` library over all chunks, limited to 30 entries.
plot_usage_in_files(libraries, library_name='re', top_n=30)
Top re Functions¶
InĀ [9]:
# Restrict the `re` chart to function and method component types (plus their
# `from_import_*` counterparts), leaving attributes, classes and exceptions out.
plot_usage_in_files(
    libraries,
    library_name='re',
    top_n=18,
    component_types=['function', 'from_import_function', 'method', 'from_import_method'],
)
Top datetime Components¶
InĀ [10]:
# Component-level view for the `datetime` library over all chunks, limited to 30 entries.
plot_usage_in_files(libraries, library_name='datetime', top_n=30)
Top time Components¶
InĀ [11]:
# Component-level view for the `time` library over all chunks, limited to 30 entries.
plot_usage_in_files(libraries, library_name='time', top_n=30)
Top unittest Components¶
InĀ [12]:
# Component-level view for the `unittest` library over all chunks, limited to 30 entries.
plot_usage_in_files(libraries, library_name='unittest', top_n=30)
Top json Components¶
InĀ [13]:
# Component-level view for the `json` library over all chunks, limited to 30 entries.
plot_usage_in_files(libraries, library_name='json', top_n=30)
Top json Functions¶
InĀ [14]:
# Restrict the `json` chart to function and method component types (plus their
# `from_import_*` counterparts), leaving attributes, classes and exceptions out.
plot_usage_in_files(
    libraries,
    library_name='json',
    top_n=30,
    component_types=['function', 'from_import_function', 'method', 'from_import_method'],
)
By Number of Component Usages¶
InĀ [15]:
# Total usage count per (library, component type) ...
df = libraries[['library_name', 'component_type', 'component', 'count']]
df_grouped = (
    df.groupby(['library_name', 'component_type'])['count']
    .sum()
    .reset_index()
)

# ... reshaped to one row per library and one column per component type;
# combinations that never occur become 0.
df_pivot = (
    df_grouped
    .pivot(index='library_name', columns='component_type', values='count')
    .fillna(0)
    .astype(int)
)
df_pivot['all_components'] = df_pivot.sum(axis=1)

# Column order: grand total first, then the plain component types, then the
# `from_import_*` variants, each group in alphabetical order.
columns_with_from_import = [name for name in df_pivot.columns if 'from_import' in name]
columns_without_from_import = [
    name
    for name in df_pivot.columns
    if name != 'all_components' and 'from_import' not in name
]
sorted_columns = [
    'all_components',
    *sorted(columns_without_from_import),
    *sorted(columns_with_from_import),
]
df_pivot = df_pivot.reindex(columns=sorted_columns)
df_sorted = df_pivot.sort_values(by='all_components', ascending=False)
InĀ [16]:
# df_sorted is ordered by all_components (descending), so this shows the 50
# libraries with the highest total component usage.
df_sorted.head(50)
Out[16]:
| component_type | all_components | attribute | class | exception | function | method | from_import_attribute | from_import_class | from_import_exception | from_import_function | from_import_method |
|---|---|---|---|---|---|---|---|---|---|---|---|
| library_name | |||||||||||
| os | 10336370 | 6990569 | 2910 | 4667 | 1551778 | 1572969 | 139821 | 447 | 80 | 69547 | 3582 |
| unittest | 9175504 | 112813 | 507738 | 7999 | 229315 | 8200405 | 132 | 88268 | 3659 | 24960 | 215 |
| re | 8514570 | 3209901 | 70 | 5995 | 1349108 | 3920251 | 601 | 9 | 67 | 17535 | 11033 |
| logging | 6805991 | 5879 | 60291 | 0 | 1062875 | 5634894 | 3 | 4243 | 0 | 26282 | 11524 |
| datetime | 4506318 | 438891 | 754423 | 0 | 0 | 1852843 | 20778 | 1192344 | 0 | 0 | 247039 |
| json | 1688985 | 391284 | 9716 | 1981 | 907038 | 354738 | 30 | 2140 | 879 | 21151 | 28 |
| threading | 1530820 | 183843 | 176859 | 190 | 10857 | 1081341 | 288 | 75067 | 13 | 1580 | 782 |
| collections | 1484301 | 90049 | 82680 | 0 | 30907 | 775558 | 672 | 406161 | 0 | 97624 | 650 |
| ctypes | 1287061 | 130028 | 238717 | 352 | 94672 | 4433 | 34262 | 526003 | 576 | 257480 | 538 |
| argparse | 1240415 | 0 | 159320 | 0 | 0 | 1049107 | 0 | 31927 | 0 | 0 | 61 |
| time | 1217957 | 0 | 1838 | 0 | 981814 | 0 | 0 | 529 | 0 | 233776 | 0 |
| typing | 1113545 | 3612 | 50917 | 0 | 2717 | 0 | 4084 | 1013348 | 0 | 38867 | 0 |
| subprocess | 1030862 | 413562 | 116502 | 32557 | 183976 | 177905 | 7709 | 35638 | 15512 | 47275 | 226 |
| struct | 972777 | 134911 | 10127 | 13573 | 319153 | 390757 | 1654 | 2846 | 992 | 53751 | 45013 |
| io | 923506 | 195677 | 88466 | 1699 | 27977 | 484731 | 286 | 112937 | 725 | 10803 | 205 |
| socket | 885323 | 47904 | 94483 | 55122 | 69795 | 596876 | 1163 | 8739 | 3581 | 6420 | 1240 |
| os.path | 856853 | 0 | 0 | 0 | 503060 | 0 | 0 | 0 | 0 | 353793 | 0 |
| xml.etree.ElementTree | 798467 | 163643 | 42717 | 0 | 141133 | 431419 | 490 | 6558 | 0 | 11212 | 1295 |
| math | 628739 | 0 | 0 | 0 | 394531 | 0 | 0 | 0 | 0 | 234208 | 0 |
| traceback | 616836 | 89487 | 420 | 0 | 122110 | 396352 | 29 | 61 | 0 | 8119 | 258 |
| sys | 557172 | 0 | 0 | 0 | 542371 | 0 | 0 | 0 | 0 | 14801 | 0 |
| random | 496583 | 0 | 11689 | 0 | 365880 | 0 | 0 | 3813 | 0 | 115201 | 0 |
| multiprocessing | 470704 | 32494 | 18270 | 214 | 13825 | 364191 | 36 | 20483 | 337 | 10308 | 10546 |
| codecs | 440154 | 115979 | 6669 | 0 | 52704 | 248614 | 1237 | 93 | 0 | 13666 | 1192 |
| inspect | 438718 | 277186 | 6270 | 0 | 99258 | 33848 | 2747 | 1362 | 0 | 18044 | 3 |
| types | 438023 | 48545 | 10221 | 0 | 981 | 363591 | 425 | 10301 | 0 | 320 | 3639 |
| functools | 414767 | 82120 | 507 | 0 | 108749 | 0 | 398 | 757 | 0 | 222236 | 0 |
| uuid | 412839 | 166037 | 24489 | 0 | 172427 | 0 | 0 | 14657 | 0 | 35229 | 0 |
| sqlite3 | 362217 | 16878 | 5074 | 8875 | 26351 | 303466 | 28 | 219 | 857 | 457 | 12 |
| unittest.mock | 345158 | 16259 | 8686 | 0 | 13513 | 6966 | 44 | 110626 | 0 | 189064 | 0 |
| zipfile | 341482 | 99540 | 30376 | 1530 | 2479 | 196918 | 298 | 8851 | 1030 | 386 | 74 |
| copy | 328723 | 0 | 0 | 109 | 198820 | 0 | 0 | 0 | 29 | 129765 | 0 |
| hashlib | 318275 | 110591 | 0 | 0 | 4466 | 202909 | 2 | 0 | 0 | 300 | 7 |
| warnings | 278173 | 0 | 43632 | 0 | 200634 | 0 | 0 | 6801 | 0 | 27106 | 0 |
| itertools | 265484 | 0 | 0 | 0 | 104289 | 7868 | 0 | 0 | 0 | 153327 | 0 |
| shutil | 264450 | 7 | 0 | 1660 | 226427 | 0 | 0 | 0 | 353 | 36003 | 0 |
| abc | 253434 | 0 | 27018 | 0 | 91771 | 11499 | 0 | 39307 | 0 | 83837 | 2 |
| pickle | 253033 | 1081 | 1420 | 1858 | 93401 | 149152 | 3 | 486 | 641 | 3379 | 1612 |
| decimal | 234633 | 0 | 29287 | 0 | 674 | 48346 | 0 | 150059 | 0 | 1616 | 4651 |
| pprint | 217857 | 0 | 2667 | 0 | 23172 | 78436 | 0 | 531 | 0 | 57152 | 55899 |
| string | 217458 | 9929 | 11417 | 0 | 857 | 182018 | 26 | 12797 | 0 | 308 | 106 |
| tempfile | 212538 | 0 | 12021 | 0 | 153519 | 0 | 0 | 3744 | 0 | 43254 | 0 |
| pathlib | 211430 | 29568 | 21836 | 0 | 0 | 60770 | 61 | 99052 | 0 | 0 | 143 |
| configparser | 211065 | 30 | 11853 | 4129 | 0 | 185958 | 0 | 7076 | 2005 | 0 | 14 |
| ast | 205507 | 12927 | 134567 | 0 | 23530 | 19099 | 494 | 9345 | 0 | 5379 | 166 |
| tarfile | 192497 | 46540 | 3712 | 1512 | 17382 | 122712 | 0 | 549 | 58 | 30 | 2 |
| operator | 188231 | 0 | 0 | 0 | 100192 | 0 | 0 | 0 | 0 | 88039 | 0 |
| urllib.request | 162729 | 61372 | 7874 | 0 | 18085 | 54859 | 598 | 6150 | 0 | 13630 | 161 |
| optparse | 148828 | 16948 | 15624 | 0 | 0 | 88688 | 857 | 26709 | 0 | 0 | 2 |
| csv | 137084 | 14689 | 19792 | 1732 | 49770 | 47751 | 84 | 1973 | 106 | 1171 | 16 |
InĀ [17]:
# Plot the five plain component-type columns of df_sorted for 30 libraries.
# NOTE(review): number_format='M' presumably formats counts in millions — confirm in analysis_utils.
plot_usage_within_files(df_sorted, ['class', 'function', 'method', 'attribute', 'exception'], top_n=30, number_format='M')
InĀ [18]:
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
def create_std_lib_treemap(component_usage, title, width=1200, height=1200, text_size=18):
    """
    Draw an interactive Plotly treemap of standard library component usage.

    NOTE(review): a function with this name is also imported from ``analysis_utils``
    in the notebook's first cell; this definition shadows it for every later cell.

    Args:
        component_usage (pd.DataFrame): Prepared treemap data. The columns read here are
            'library_name', 'component', 'count', 'library_percentage' and
            'total_percentage'.
        title (str): Title of the treemap.
        width (int): Layout width of the figure. Defaults to 1200.
        height (int): Layout height of the figure. Defaults to 1200.
        text_size (int): Minimum size of the uniform tile text. Defaults to 18.

    Returns:
        None: The treemap is displayed with ``fig.show()``.
    """
    # Tooltip rows; the customdata indices follow the order of `custom_data` below.
    hovertemplate = (
        "%{label}"
        "<br>Count: %{customdata[0]}"
        "<br>Share of library: %{customdata[1]:.2f}%"
        "<br>Share of all: %{customdata[2]:.2f}%"
    )
    column_labels = {'library_name': 'Library', 'component': 'Component', 'count': 'Count'}

    # Two-level hierarchy (library -> component); tile area is the usage count and
    # tile colour is the component's share within its library.
    fig = px.treemap(
        component_usage,
        path=['library_name', 'component'],
        values='count',
        color='library_percentage',
        custom_data=['count', 'library_percentage', 'total_percentage'],
        title=title,
        color_continuous_scale='RdBu',
        labels=column_labels,
    )
    fig.update_traces(hovertemplate=hovertemplate, textinfo='label+value+percent parent')

    # Centre a large title at the top and keep the remaining margins tight.
    title_layout = {
        'text': title,
        'font': {'size': 30},
        'x': 0.5,
        'y': 0.98,
        'xanchor': 'center',
        'yanchor': 'top',
    }
    fig.update_layout(
        width=width,
        height=height,
        uniformtext={'minsize': text_size, 'mode': 'show'},
        title=title_layout,
        margin={'l': 10, 'r': 10, 't': 100, 'b': 10},
    )
    fig.show()
Components in All Files¶
InĀ [19]:
# All chunks (.py and .ipynb).
# NOTE(review): `threshold` is forwarded to the helper — presumably a minimum share
# a component needs to be drawn; confirm in analysis_utils.
component_usage_sum = prepare_std_lib_treemap_data(libraries, threshold=0.003)
create_std_lib_treemap(component_usage_sum, "Python Standard Library Modules Usage", width=1800, height=1400, text_size=10)
Components in .py Files¶
InĀ [20]:
# .py chunks only; no threshold is passed, so the helper's default applies.
component_usage_sum = prepare_std_lib_treemap_data(libraries_py)
create_std_lib_treemap(component_usage_sum, "Python Standard Library Components Usage in .py Files", width=1800, height=1400, text_size=10)
Components in .ipynb Files¶
InĀ [21]:
# .ipynb chunks only; no threshold is passed, so the helper's default applies.
component_usage_sum = prepare_std_lib_treemap_data(libraries_ipynb)
create_std_lib_treemap(component_usage_sum, "Python Standard Library Components Usage in .ipynb Files", width=1800, height=1400, text_size=10)
Components in Files with os¶
InĀ [22]:
# Keep only the `os` rows of the .py usage data, then build and draw the treemap.
os_module = libraries_py.loc[libraries_py['library_name'] == 'os']
component_usage_sum = prepare_std_lib_treemap_data(os_module, threshold=0.001)
create_std_lib_treemap(
    component_usage_sum,
    "os Component Usage",
    width=1800,
    height=1400,
    text_size=10,
)
Components in Files with unittest¶
InĀ [23]:
# Keep only the `unittest` rows of the .py usage data, then build and draw the treemap.
unittest_module = libraries_py.loc[libraries_py['library_name'] == 'unittest']
component_usage_sum = prepare_std_lib_treemap_data(unittest_module, threshold=0.001)
create_std_lib_treemap(
    component_usage_sum,
    "unittest Component Usage",
    width=1800,
    height=1400,
    text_size=10,
)
Components in Files with re¶
InĀ [24]:
# Keep only the `re` rows of the .py usage data, then build and draw the treemap.
re_module = libraries_py.loc[libraries_py['library_name'] == 're']
component_usage_sum = prepare_std_lib_treemap_data(re_module, threshold=0.001)
create_std_lib_treemap(
    component_usage_sum,
    "re Component Usage",
    width=1800,
    height=1400,
    text_size=10,
)
Components in Files with logging¶
InĀ [25]:
# Keep only the `logging` rows of the .py usage data, then build and draw the treemap.
logging_module = libraries_py.loc[libraries_py['library_name'] == 'logging']
component_usage_sum = prepare_std_lib_treemap_data(logging_module, threshold=0.001)
create_std_lib_treemap(
    component_usage_sum,
    "logging Component Usage",
    width=1800,
    height=1400,
    text_size=10,
)
Components in Files with datetime¶
InĀ [26]:
# Keep only the `datetime` rows of the .py usage data, then build and draw the treemap.
datetime_module = libraries_py.loc[libraries_py['library_name'] == 'datetime']
component_usage_sum = prepare_std_lib_treemap_data(datetime_module, threshold=0.001)
create_std_lib_treemap(
    component_usage_sum,
    "datetime Component Usage",
    width=1800,
    height=1400,
    text_size=10,
)
Components in Files with json¶
InĀ [27]:
# Keep only the `json` rows of the .py usage data, then build and draw the treemap.
json_module = libraries_py.loc[libraries_py['library_name'] == 'json']
component_usage_sum = prepare_std_lib_treemap_data(json_module, threshold=0.001)
create_std_lib_treemap(
    component_usage_sum,
    "json Component Usage",
    width=1800,
    height=1400,
    text_size=10,
)
Components in Files with threading¶
InĀ [28]:
# Keep only the `threading` rows of the .py usage data, then build and draw the treemap.
threading_module = libraries_py.loc[libraries_py['library_name'] == 'threading']
component_usage_sum = prepare_std_lib_treemap_data(threading_module, threshold=0.001)
create_std_lib_treemap(
    component_usage_sum,
    "threading Component Usage",
    width=1800,
    height=1400,
    text_size=10,
)
Components in Files with collections¶
InĀ [29]:
# Keep only the `collections` rows of the .py usage data, then build and draw the treemap.
collections_module = libraries_py.loc[libraries_py['library_name'] == 'collections']
component_usage_sum = prepare_std_lib_treemap_data(collections_module, threshold=0.001)
create_std_lib_treemap(
    component_usage_sum,
    "collections Component Usage",
    width=1800,
    height=1400,
    text_size=10,
)
Classes in Files with collections¶
InĀ [30]:
# Only the two class component types are handed to the helper for this treemap.
component_types = ('from_import_class', 'class')
collections_module = libraries_py.loc[libraries_py['library_name'] == 'collections']
component_usage_sum = prepare_std_lib_treemap_data(
    collections_module,
    component_types,
    threshold=0.001,
)
create_std_lib_treemap(
    component_usage_sum,
    "collections Class Usage",
    width=1800,
    height=1400,
    text_size=10,
)
Components in Files with ctypes (blogpost version)¶
InĀ [31]:
# Blog-post variant: higher threshold (0.01) and the plotting function's default
# figure size and text size.
ctypes_module = libraries_py.loc[libraries_py['library_name'] == 'ctypes']
component_usage_sum = prepare_std_lib_treemap_data(ctypes_module, threshold=0.01)
create_std_lib_treemap(
    component_usage_sum,
    "ctypes Component Usage",
)
Components in Files with shutil (blogpost version)¶
InĀ [32]:
# Blog-post variant for `shutil`: threshold 0.01 and the plotting function's defaults.
# The rows get their own name so the ctypes frame built earlier is not overwritten
# with shutil data under a misleading variable name.
shutil_module = libraries_py[libraries_py['library_name'] == 'shutil']
component_usage_sum = prepare_std_lib_treemap_data(shutil_module, threshold=0.01)
create_std_lib_treemap(component_usage_sum, "shutil Component Usage")
Components in Files with os (blogpost version)¶
InĀ [33]:
# Blog-post variant for `os`: threshold 0.01 and the plotting function's defaults.
# The rows are bound to `os_module` (the same filter used for the full-size os
# treemap) instead of overwriting `ctypes_module` with os data.
os_module = libraries_py[libraries_py['library_name'] == 'os']
component_usage_sum = prepare_std_lib_treemap_data(os_module, threshold=0.01)
create_std_lib_treemap(component_usage_sum, "os Component Usage")
Components in Files with unittest (blogpost version)¶
InĀ [34]:
# Blog-post variant for `unittest`: threshold 0.01 and the plotting function's defaults.
# The rows are bound to `unittest_module` (the same filter used for the full-size
# unittest treemap) instead of overwriting `ctypes_module` with unittest data.
unittest_module = libraries_py[libraries_py['library_name'] == 'unittest']
component_usage_sum = prepare_std_lib_treemap_data(unittest_module, threshold=0.01)
create_std_lib_treemap(component_usage_sum, "unittest Component Usage")
Library Co-Occurrence¶
By Number of Library Components Used¶
InĀ [35]:
# Total component usages per (chunk, library) ...
libraries_counts = (
    libraries
    .groupby(['chunk_id', 'library_name'])['count']
    .sum()
    .reset_index()
)
# ... spread into a chunk x library matrix; libraries a chunk never uses are filled with 0.
libraries_counts_pivot = libraries_counts.pivot_table(
    index='chunk_id',
    columns='library_name',
    values='count',
    fill_value=0,
)
libraries_counts_pivot
Out[35]:
| library_name | _thread | abc | aifc | argparse | array | ast | asynchat | asyncore | atexit | audioop | ... | xml.sax.handler | xml.sax.saxutils | xml.sax.xmlreader | xmlrpc.client | xmlrpc.server | zipapp | zipfile | zipimport | zlib | zoneinfo |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| chunk_id | |||||||||||||||||||||
| 0000081255acf04f13c1c84f4e86f7410a5bd792 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 00000eec027d380439a62cf403242855f96f3867 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 0000136d1c63e78b3ab59641a7146a60da2c919e | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 0000138611e4779ad8f368689b4c28f7a6a6ee1a | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 000014207524492ac0b6c407d99d481d00e3dc3c | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| ffffe8b423a586793ee52aa7099e20f8a0758e7b | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ffffeb99a75ba02cc350f3fb587cf05aaf8543fa | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ffffebdb4ab741c9f40872a832bac4d66d879522 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| fffff44a72b9d75ae9d62414d15f2e3d61c4b7c5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| fffff95467ad0cd33bdb7f041a99a44ec3649538 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3107215 rows × 221 columns
InĀ [36]:
# Per-library usage totals, computed once: the pivot has millions of rows, so a
# second full-frame sum just to obtain the grand total is avoidable work.
library_totals_by_componentcount = libraries_counts_pivot.sum()
column_percents_by_componentcount = library_totals_by_componentcount / library_totals_by_componentcount.sum()

# NOTE: despite the "above_1_prc" in the names, this cut-off is 0.63% of all component usages.
libraries_counts_pivot_above_1_prc_by_componentcount = libraries_counts_pivot.loc[:, column_percents_by_componentcount > 0.0063]
correlation_matrix_above_1_prc_by_componentcount = libraries_counts_pivot_above_1_prc_by_componentcount.corr()

# Wider selection: every library holding more than 0.1% of all component usages.
libraries_counts_pivot_above_01_prc_by_componentcount = libraries_counts_pivot.loc[:, column_percents_by_componentcount > 0.001]
correlation_matrix_above_01_prc_by_componentcount = libraries_counts_pivot_above_01_prc_by_componentcount.corr()
InĀ [37]:
# Cells where the mask is True are not drawn: hide the upper triangle and the
# diagonal so each library pair is annotated once.
mask = np.triu(np.ones(correlation_matrix_above_1_prc_by_componentcount.shape, dtype=bool))
plt.figure(figsize=(18, 18))
sns.heatmap(
    correlation_matrix_above_1_prc_by_componentcount,
    mask=mask,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    annot=True,
    fmt='.2f',
    annot_kws={'size': 12},
    linewidths=0.5,
)
plt.title(r"Correlation Matrix by Component Count")
plt.xlabel("Library Name")
plt.ylabel("Library Name")
plt.xticks(rotation=90, fontsize=14)
plt.yticks(rotation=0, fontsize=14)
plt.show()
InĀ [38]:
# Cells where the mask is True are not drawn: hide the upper triangle and the
# diagonal so each library pair is annotated once. More libraries pass the 0.1%
# cut-off, hence the larger figure, smaller annotations and one-decimal format.
mask = np.triu(np.ones(correlation_matrix_above_01_prc_by_componentcount.shape, dtype=bool))
plt.figure(figsize=(20, 20))
sns.heatmap(
    correlation_matrix_above_01_prc_by_componentcount,
    mask=mask,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    annot=True,
    fmt='.1f',
    annot_kws={'size': 8},
    linewidths=0.5,
)
plt.title(r"Correlation Matrix by Component Count")
plt.xlabel("Library Name")
plt.ylabel("Library Name")
plt.xticks(rotation=90, fontsize=14)
plt.yticks(rotation=0, fontsize=14)
plt.show()
By Number of Files in Which Libraries Were Used¶
InĀ [39]:
# One row per distinct (chunk, library) pair, spread into a chunk x library matrix.
libraries_small = libraries[['chunk_id', 'library_name']].drop_duplicates()
libraries_pivot = libraries_small.pivot_table(index='chunk_id', columns='library_name', aggfunc=len, fill_value=0)
# Presence flag (1 = library used in the chunk, 0 = not used). A vectorised
# comparison replaces the element-wise `applymap` lambda, which is deprecated in
# pandas >= 2.1 and calls a Python function once per cell of a multi-million-row frame.
libraries_binary = (libraries_pivot > 0).astype(int)
libraries_binary
Out[39]:
| library_name | _thread | abc | aifc | argparse | array | ast | asynchat | asyncore | atexit | audioop | ... | xml.sax.handler | xml.sax.saxutils | xml.sax.xmlreader | xmlrpc.client | xmlrpc.server | zipapp | zipfile | zipimport | zlib | zoneinfo |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| chunk_id | |||||||||||||||||||||
| 0000081255acf04f13c1c84f4e86f7410a5bd792 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 00000eec027d380439a62cf403242855f96f3867 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 0000136d1c63e78b3ab59641a7146a60da2c919e | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 0000138611e4779ad8f368689b4c28f7a6a6ee1a | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 000014207524492ac0b6c407d99d481d00e3dc3c | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| ffffe8b423a586793ee52aa7099e20f8a0758e7b | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ffffeb99a75ba02cc350f3fb587cf05aaf8543fa | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ffffebdb4ab741c9f40872a832bac4d66d879522 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| fffff44a72b9d75ae9d62414d15f2e3d61c4b7c5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| fffff95467ad0cd33bdb7f041a99a44ec3649538 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3107215 rows × 221 columns
InĀ [40]:
# Number of chunks using each library, computed once instead of summing the whole
# presence matrix a second time just to obtain the grand total.
library_totals_by_filecount = libraries_binary.sum()
# Share of each library among all (chunk, library) occurrences.
column_percents_by_filecount = library_totals_by_filecount / library_totals_by_filecount.sum()

# Libraries above 1% of all occurrences.
libraries_binary_above_1_prc_by_filecount = libraries_binary.loc[:, column_percents_by_filecount > 0.01]
correlation_matrix_above_1_prc_by_filecount = libraries_binary_above_1_prc_by_filecount.corr()

# Wider selection: libraries above 0.1% of all occurrences.
libraries_binary_above_01_prc_by_filecount = libraries_binary.loc[:, column_percents_by_filecount > 0.001]
correlation_matrix_above_01_prc_by_filecount = libraries_binary_above_01_prc_by_filecount.corr()
InĀ [41]:
# Cells where the mask is True are not drawn: hide the upper triangle and the
# diagonal so each library pair is annotated once.
mask = np.triu(np.ones(correlation_matrix_above_1_prc_by_filecount.shape, dtype=bool))
plt.figure(figsize=(18, 18))
sns.heatmap(
    correlation_matrix_above_1_prc_by_filecount,
    mask=mask,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    annot=True,
    fmt='.2f',
    annot_kws={'size': 12},
    linewidths=0.5,
)
plt.title(r"Correlation Matrix by File Count")
plt.xlabel("Library Name")
plt.ylabel("Library Name")
plt.xticks(rotation=90, fontsize=14)
plt.yticks(rotation=0, fontsize=14)
plt.show()
InĀ [42]:
# Cells where the mask is True are not drawn: hide the upper triangle and the
# diagonal so each library pair is annotated once. More libraries pass the 0.1%
# cut-off, hence the larger figure, smaller annotations and one-decimal format.
mask = np.triu(np.ones(correlation_matrix_above_01_prc_by_filecount.shape, dtype=bool))
plt.figure(figsize=(22, 22))
sns.heatmap(
    correlation_matrix_above_01_prc_by_filecount,
    mask=mask,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    annot=True,
    fmt='.1f',
    annot_kws={'size': 8},
    linewidths=0.5,
)
plt.title(r"Correlation Matrix by File Count")
plt.xlabel("Library Name")
plt.ylabel("Library Name")
plt.xticks(rotation=90, fontsize=14)
plt.yticks(rotation=0, fontsize=14)
plt.show()
Mean Complexity of Code Files by Library Used in Them¶
InĀ [6]:
# .py usage rows joined against the per-file metadata.
# NOTE(review): the positional 10000 is presumably a minimum number of files a
# library must appear in to be plotted — confirm in analysis_utils.
plot_mean_complexity(libraries_py, metadata, 10000, 'Mean Complexity of .py Files by Library Used in Them')
InĀ [5]:
# Same inputs with a lower positional cut-off (2000) and a wider figure.
# NOTE(review): the cut-off is presumably a minimum file count per library — confirm in analysis_utils.
plot_mean_complexity(libraries_py, metadata, 2000, 'Mean Complexity of .py Files by Library Used in Them', figsize=(30, 10))
InĀ [6]:
!jupyter nbconvert --to html std_library_analysis.ipynb
[NbConvertApp] Converting notebook std_library_analysis.ipynb to html [NbConvertApp] WARNING | Alternative text is missing on 17 image(s). [NbConvertApp] Writing 7778604 bytes to std_library_analysis.html